{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example: Improving LLM Chess Ability with Best/Mixture-of-N Sampling\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import asyncio\n",
    "import random\n",
    "from copy import deepcopy\n",
    "from typing import Dict, List, Optional, Tuple\n",
    "from uuid import UUID\n",
    "\n",
    "import altair as alt\n",
    "import chess\n",
    "import chess.engine\n",
    "import chess.pgn\n",
    "import pandas as pd\n",
    "from tensorzero import AsyncTensorZeroGateway\n",
    "from tqdm import tqdm, trange"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We're going to evaluate the following variants in this notebook. Feel free to create your own variants and evaluate them!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the variants to benchmark; they are evaluated in this order\n",
    "VARIANTS = [\"baseline\", \"best_of_5\", \"mixture_of_5\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's load the training set but truncate it to 1000 examples.\n",
    "\n",
    "For a quick trial, you can use `NUM_EXAMPLES = 10`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_EXAMPLES = 1000\n",
    "\n",
    "# Read the training puzzles and keep only the first NUM_EXAMPLES rows\n",
    "puzzle_df = pd.read_csv(\"data/lichess_easy_puzzles_train.csv\").head(NUM_EXAMPLES)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll try to solve the puzzles concurrently to speed up the evaluation.\n",
    "Reduce the value below if you're getting rate-limited by the API providers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_CONCURRENT_T0_REQUESTS = 100\n",
    "\n",
    "# Shared by the helpers below to cap the number of in-flight gateway requests\n",
    "semaphore = asyncio.Semaphore(value=MAX_CONCURRENT_T0_REQUESTS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's initialize the client for the TensorZero Gateway."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Async HTTP client for the TensorZero Gateway (top-level `await` works in Jupyter)\n",
    "t0 = await AsyncTensorZeroGateway.build_http(\n",
    "    gateway_url=\"http://localhost:3000\",\n",
    "    timeout=60,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper Functions: Solving Chess Puzzles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we define a helper function to predict the next move for a given variant."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def predict_next_move(\n",
    "    board: chess.Board, variant_name: str, episode_id: Optional[UUID] = None\n",
    ") -> Tuple[str, Optional[UUID]]:\n",
    "    \"\"\"\n",
    "    Asks the TensorZero Gateway which move to play in the given position.\n",
    "\n",
    "    The board, the side to move, and the list of legal moves are sent to the\n",
    "    `play_chess_board` function using the requested variant. Whenever the gateway\n",
    "    call fails or the response contains no usable move, a random legal move is\n",
    "    returned instead.\n",
    "\n",
    "    Args:\n",
    "        board (chess.Board): The current chess board state.\n",
    "        variant_name (str): The name of the variant to use for prediction (e.g., \"baseline\")\n",
    "        episode_id (Optional[UUID], optional): The episode ID used to group multiple calls\n",
    "                                              for the same puzzle. Defaults to None.\n",
    "\n",
    "    Returns:\n",
    "        Tuple[str, Optional[UUID]]:\n",
    "            - str: The predicted move in standard algebraic notation (SAN).\n",
    "            - UUID: The episode ID reported by the gateway; if the call itself failed,\n",
    "              the `episode_id` that was passed in.\n",
    "    \"\"\"\n",
    "    # Legal moves in standard algebraic notation (SAN); also the pool for random fallbacks\n",
    "    candidate_moves = [board.san(legal_move) for legal_move in board.legal_moves]\n",
    "\n",
    "    try:\n",
    "        user_message = {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": {\n",
    "                \"board\": str(board),\n",
    "                \"color\": \"white\" if board.turn else \"black\",\n",
    "                \"legal_moves_san\": candidate_moves,\n",
    "            },\n",
    "        }\n",
    "        response = await t0.inference(\n",
    "            function_name=\"play_chess_board\",\n",
    "            input={\"messages\": [user_message]},\n",
    "            variant_name=variant_name,\n",
    "            episode_id=episode_id,\n",
    "        )\n",
    "    except Exception as e:\n",
    "        print(f\"Error occurred: {type(e).__name__}: {e}\")\n",
    "        return random.choice(candidate_moves), episode_id\n",
    "\n",
    "    # From here on, report the episode ID carried by the gateway's response\n",
    "    episode_id = response.episode_id\n",
    "\n",
    "    parsed_output = response.output.parsed\n",
    "    if parsed_output is None:\n",
    "        print(\"Error: TensorZero returned no parsed output.\")\n",
    "        return random.choice(candidate_moves), episode_id\n",
    "\n",
    "    move = parsed_output.get(\"move\")\n",
    "    if move is None:\n",
    "        print(\"Error: TensorZero returned no move.\")\n",
    "        return random.choice(candidate_moves), episode_id\n",
    "\n",
    "    return move, episode_id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next we define a helper function to solve a single puzzle."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _parse_expected_move(board: chess.Board, move_text: str) -> chess.Move:\n",
    "    \"\"\"\n",
    "    Parses a move from a puzzle's solution, given in SAN or UCI notation.\n",
    "\n",
    "    Args:\n",
    "        board (chess.Board): The position in which the move is played.\n",
    "        move_text (str): The move in SAN (e.g., \"Nf3\") or UCI (e.g., \"g1f3\").\n",
    "\n",
    "    Returns:\n",
    "        chess.Move: The parsed move.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the text is neither valid SAN nor a legal UCI move in this position.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return board.parse_san(move_text)\n",
    "    except ValueError:\n",
    "        return board.parse_uci(move_text)\n",
    "\n",
    "\n",
    "async def solve_puzzle(puzzle_data: Dict, variant_name: Optional[str] = None) -> Tuple[bool, Optional[UUID]]:\n",
    "    \"\"\"\n",
    "    Runs a chess puzzle and checks if the player solves it correctly.\n",
    "\n",
    "    This function simulates a chess puzzle by applying the first move from the puzzle data,\n",
    "    then alternating between player and opponent moves according to the expected sequence.\n",
    "    The player's moves are generated using the predict_next_move function. A player move is\n",
    "    accepted if it matches the expected move or if it delivers checkmate.\n",
    "\n",
    "    Args:\n",
    "        puzzle_data (Dict): A dictionary containing puzzle details including:\n",
    "            - \"FEN\": The FEN notation of the starting position\n",
    "            - \"Moves\": A string of space-separated moves (SAN or UCI) in the expected solution\n",
    "        variant_name (Optional[str], optional): The name of the variant to use for prediction\n",
    "            (e.g., \"baseline\"). If omitted, the notebook-level `variant_name` is used, which\n",
    "            keeps single-argument calls working; prefer passing it explicitly.\n",
    "\n",
    "    Returns:\n",
    "        Tuple[bool, Optional[UUID]]:\n",
    "            - bool: True if the player solves the puzzle correctly, False otherwise\n",
    "            - UUID: Optional episode ID for tracking the puzzle attempt (for feedback)\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If a move of the expected solution cannot be parsed in its position.\n",
    "    \"\"\"\n",
    "\n",
    "    if variant_name is None:\n",
    "        # Backward compatibility for single-argument calls: fall back to the notebook-level\n",
    "        # `variant_name`, which this function previously read implicitly.\n",
    "        variant_name = globals()[\"variant_name\"]\n",
    "\n",
    "    # Extract puzzle details from puzzle_data\n",
    "    fen = puzzle_data.get(\"FEN\")\n",
    "    expected_moves = puzzle_data.get(\"Moves\", \"\").split()\n",
    "    board = chess.Board(fen)\n",
    "    total_moves = len(expected_moves)\n",
    "\n",
    "    # Apply the first move before starting the puzzle (as expected by the benchmark)\n",
    "    board.push(_parse_expected_move(board, expected_moves[0]))\n",
    "    move_index = 1\n",
    "\n",
    "    # Determine player's color based on the updated position\n",
    "    player_color = board.turn  # True for White, False for Black\n",
    "    episode_id = None\n",
    "\n",
    "    while move_index < total_moves and not board.is_game_over():\n",
    "        expected_move = expected_moves[move_index]\n",
    "\n",
    "        if board.turn == player_color:  # Player's move\n",
    "            async with semaphore:\n",
    "                player_move_san, episode_id = await predict_next_move(deepcopy(board), variant_name, episode_id)\n",
    "\n",
    "            try:\n",
    "                player_move_obj = board.parse_san(player_move_san)\n",
    "            except (ValueError, TypeError):\n",
    "                # Unparseable, illegal, or non-string move: the attempt fails\n",
    "                return False, episode_id\n",
    "\n",
    "            expected_move_obj = _parse_expected_move(board, expected_move)\n",
    "            board.push(player_move_obj)\n",
    "\n",
    "            # Any move that delivers checkmate solves the puzzle, even if it is not the\n",
    "            # listed one. This must be checked after the move has been played.\n",
    "            if board.is_checkmate():\n",
    "                return True, episode_id\n",
    "\n",
    "            if player_move_obj != expected_move_obj:\n",
    "                return False, episode_id\n",
    "        else:  # Opponent's move\n",
    "            board.push(_parse_expected_move(board, expected_move))\n",
    "\n",
    "        move_index += 1\n",
    "\n",
    "    # Solved only if the whole expected sequence was played out\n",
    "    return move_index == total_moves, episode_id"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, we define a function to solve all the puzzles in the dataframe."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async def solve_puzzles(\n",
    "    puzzle_df: pd.DataFrame,\n",
    "    variant_name: str,\n",
    ") -> List[bool]:\n",
    "    \"\"\"\n",
    "    Solves a batch of chess puzzles concurrently and sends feedback for each attempt.\n",
    "\n",
    "    This function processes a dataframe of chess puzzles, attempting to solve each one\n",
    "    using the specified variant. It runs the puzzles concurrently to improve throughput,\n",
    "    then reports the `puzzle_success` metric for every attempt that has an episode ID.\n",
    "\n",
    "    Args:\n",
    "        puzzle_df (pd.DataFrame): DataFrame containing chess puzzles with FEN positions and expected moves.\n",
    "        variant_name (str): The name of the variant to use for prediction (e.g., \"baseline\", \"best_of_5\").\n",
    "\n",
    "    Returns:\n",
    "        List[bool]: Success (True) or failure (False) for each puzzle, in the same order\n",
    "        as the rows of `puzzle_df`.\n",
    "    \"\"\"\n",
    "\n",
    "    total_puzzles = len(puzzle_df)\n",
    "\n",
    "    # Pass the variant explicitly so the tasks do not depend on notebook-level state\n",
    "    tasks = [\n",
    "        asyncio.create_task(solve_puzzle(puzzle_data, variant_name))\n",
    "        for puzzle_data in puzzle_df.to_dict(orient=\"records\")\n",
    "    ]\n",
    "\n",
    "    # Update the progress bar as puzzles finish (completion order)\n",
    "    num_successes = 0\n",
    "    with trange(total_puzzles, desc=f\"[Inference] {variant_name}\") as progress_bar:\n",
    "        for num_completed, finished in enumerate(asyncio.as_completed(tasks), start=1):\n",
    "            success, _ = await finished\n",
    "            num_successes += int(success)\n",
    "            progress_bar.update(1)\n",
    "            progress_bar.set_postfix(\n",
    "                {\"Success\": f\"{num_successes}/{num_completed}\"},\n",
    "                refresh=True,\n",
    "            )\n",
    "\n",
    "    # All tasks are done here; read their results in dataframe order so that the\n",
    "    # returned list lines up with the rows of `puzzle_df`\n",
    "    outcomes = [task.result() for task in tasks]\n",
    "    successes = [success for success, _ in outcomes]\n",
    "\n",
    "    async def send_feedback(success: bool, episode_id: UUID) -> None:\n",
    "        # The semaphore bounds concurrent gateway requests, as it does for inference\n",
    "        async with semaphore:\n",
    "            await t0.feedback(\n",
    "                episode_id=episode_id,\n",
    "                metric_name=\"puzzle_success\",\n",
    "                value=success,\n",
    "            )\n",
    "\n",
    "    # Attempts without an episode ID (e.g., every inference call failed) get no feedback\n",
    "    feedback_tasks = [\n",
    "        asyncio.create_task(send_feedback(success, episode_id)) for success, episode_id in outcomes if episode_id\n",
    "    ]\n",
    "\n",
    "    for finished in tqdm(\n",
    "        asyncio.as_completed(feedback_tasks),\n",
    "        total=len(feedback_tasks),\n",
    "        desc=f\"[Feedback] {variant_name}\",\n",
    "    ):\n",
    "        await finished\n",
    "\n",
    "    return successes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate the Variants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Outcomes per variant: variant name -> one success flag per puzzle\n",
    "results: Dict[str, List[bool]] = {}\n",
    "\n",
    "# Variants are evaluated one after another\n",
    "for variant_name in VARIANTS:\n",
    "    results[variant_name] = await solve_puzzles(puzzle_df, variant_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Format the results for plotting: one row per (puzzle, variant) outcome\n",
    "results_df = pd.DataFrame(results).stack().reset_index()\n",
    "results_df.columns = [\"Puzzle\", \"Variant\", \"Success Rate\"]\n",
    "results_df[\"Mean Success Rate by Variant\"] = results_df.groupby(\"Variant\")[\"Success Rate\"].transform(\"mean\")\n",
    "\n",
    "# Append each variant's mean success rate to its label, e.g. \"baseline (12.3%)\"\n",
    "results_df[\"Variant\"] = results_df.apply(\n",
    "    lambda row: f\"{row['Variant']} ({row['Mean Success Rate by Variant'] * 100:.1f}%)\",\n",
    "    axis=1,\n",
    ")\n",
    "\n",
    "# Both layers share the same x-axis title and y ordering. The layers encode x differently\n",
    "# (aggregated vs. raw field), so without an explicit title the layered chart would\n",
    "# combine their two default titles into one axis label.\n",
    "X_AXIS_TITLE = \"Success Rate\"\n",
    "\n",
    "# Plot the results: one bar per variant showing its mean success rate\n",
    "chart = (\n",
    "    alt.Chart(results_df)\n",
    "    .encode(\n",
    "        x=alt.X(\n",
    "            \"mean(Success Rate):Q\",\n",
    "            title=X_AXIS_TITLE,\n",
    "            axis=alt.Axis(format=\"%\"),\n",
    "            scale=alt.Scale(domain=[0, 1]),\n",
    "        ),\n",
    "        y=alt.Y(\"Variant:N\", sort=None),\n",
    "    )\n",
    "    .mark_bar()\n",
    ")\n",
    "\n",
    "# Confidence interval of the per-puzzle outcomes, drawn on top of the bars\n",
    "error_bars = (\n",
    "    alt.Chart(results_df)\n",
    "    .mark_errorbar(extent=\"ci\")\n",
    "    .encode(\n",
    "        x=alt.X(\"Success Rate:Q\", title=X_AXIS_TITLE),\n",
    "        y=alt.Y(\"Variant:N\", sort=None),\n",
    "    )\n",
    ")\n",
    "\n",
    "(chart + error_bars).properties(title=\"Success Rate by Variant\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
